In [ ]:
import numpy as np
import pandas as pd
In [1]:
# load the raw user-to-user transaction edges
column_names = ['txn_key', 'from_user', 'to_user', 'date', 'amount']
df = pd.read_csv('../data/bitcoin_uic_data_and_code_20130410/user_edges.txt', names=column_names)
df.head()
Out[1]:
In [2]:
# keep only the pre-2011 transactions (the raw dates compare numerically as YYYYMMDDHHMMSS)
df[df.date < 20110000000000].to_csv('../data/subset/user_edges_2010.csv', index=False)
In [3]:
df = pd.read_csv('../data/subset/user_edges_2010.csv')
In [16]:
# parse the date column into pandas datetimes
df['date'] = pd.to_datetime(df.date, format='%Y-%m-%d %H:%M:%S')
In [5]:
df.to_csv('../data/subset/user_edges_2010.csv', index=False)
In [17]:
import networkx as nx
# NB: from_pandas_dataframe is the NetworkX 1.x API (renamed from_pandas_edgelist in 2.x)
# undirected view: for features only defined on undirected graphs (e.g. clustering coefficient)
G = nx.from_pandas_dataframe(df,
                             source='from_user', target='to_user',
                             edge_attr=['txn_key', 'amount', 'date'],
                             create_using=nx.Graph())
# directed view: unique links between users
G_di = nx.from_pandas_dataframe(df,
                                source='from_user', target='to_user',
                                edge_attr=['txn_key', 'amount', 'date'],
                                create_using=nx.DiGraph())
# the full graph: one edge per transaction
G_mdi = nx.from_pandas_dataframe(df,
                                 source='from_user', target='to_user',
                                 edge_attr=['txn_key', 'amount', 'date'],
                                 create_using=nx.MultiDiGraph())
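A quick sanity check on the three views (a sketch; exact counts depend on the 2010 subset): the simple Graph/DiGraph collapse parallel transactions between the same pair of users, so their edge counts should be lower than the MultiDiGraph's.
In [ ]:
# node and edge counts for the three graph views
for name, g in [('undirected', G), ('directed', G_di), ('multi-directed', G_mdi)]:
    print(name, g.number_of_nodes(), g.number_of_edges())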
In [18]:
# transaction feature maps (Series keyed by txn_key)
count_by_key = df.groupby('txn_key').size()
amount_by_key = df.groupby('txn_key').amount.sum()
ufrom_by_key = df.groupby('txn_key').from_user.agg(pd.Series.nunique)
uto_by_key = df.groupby('txn_key').to_user.agg(pd.Series.nunique)
# user feature maps (Series keyed by user id)
in_txn_count = df.groupby('to_user').size()
in_key_count = df.groupby('to_user').txn_key.agg(pd.Series.nunique)
out_txn_count = df.groupby('from_user').size()
out_key_count = df.groupby('from_user').txn_key.agg(pd.Series.nunique)
total_in_txn_amt = df.groupby('to_user').amount.sum()
total_out_txn_amt = df.groupby('from_user').amount.sum()
avg_in_txn_amt = df.groupby('to_user').amount.mean()
avg_out_txn_amt = df.groupby('from_user').amount.mean()
from_fst_txn_date = df.groupby('from_user').date.min()
In [19]:
df_feat = df.assign(
    # transaction features
    count_by_key = df.txn_key.map(count_by_key),
    amount_by_key = df.txn_key.map(amount_by_key),
    from_eq_to = df.from_user == df.to_user,
    ufrom_by_key = df.txn_key.map(ufrom_by_key),
    uto_by_key = df.txn_key.map(uto_by_key),
    # transaction date features
    date_year = df.date.dt.year,
    date_month = df.date.dt.month,
    date_day = df.date.dt.day,
    date_dayofweek = df.date.dt.dayofweek,
    date_dayofyear = df.date.dt.dayofyear,
    date_hour = df.date.dt.hour,
    date_minute = df.date.dt.minute,
    date_second = df.date.dt.second,
    # user features
    from_in_txn_count = df.from_user.map(in_txn_count),
    from_in_key_count = df.from_user.map(in_key_count),
    from_out_txn_count = df.from_user.map(out_txn_count),
    from_out_key_count = df.from_user.map(out_key_count),
    to_in_txn_count = df.to_user.map(in_txn_count),
    to_in_key_count = df.to_user.map(in_key_count),
    to_out_txn_count = df.to_user.map(out_txn_count),
    to_out_key_count = df.to_user.map(out_key_count),
    from_total_in_txn_amt = df.from_user.map(total_in_txn_amt),
    from_total_out_txn_amt = df.from_user.map(total_out_txn_amt),
    to_total_in_txn_amt = df.to_user.map(total_in_txn_amt),
    to_total_out_txn_amt = df.to_user.map(total_out_txn_amt),
    from_avg_in_txn_amt = df.from_user.map(avg_in_txn_amt),
    from_avg_out_txn_amt = df.from_user.map(avg_out_txn_amt),
    to_avg_in_txn_amt = df.to_user.map(avg_in_txn_amt),
    to_avg_out_txn_amt = df.to_user.map(avg_out_txn_amt),
    # graph degree features (NetworkX 1.x degree() returns a dict, so Series.map works directly)
    from_in_deg = df.from_user.map(G_mdi.in_degree()),
    from_out_deg = df.from_user.map(G_mdi.out_degree()),
    from_in_udeg = df.from_user.map(G_di.in_degree()),
    from_out_udeg = df.from_user.map(G_di.out_degree()),
    to_in_deg = df.to_user.map(G_mdi.in_degree()),
    to_out_deg = df.to_user.map(G_mdi.out_degree()),
    to_in_udeg = df.to_user.map(G_di.in_degree()),
    to_out_udeg = df.to_user.map(G_di.out_degree()),
    # clustering coefficient on the undirected graph
    from_cc = df.from_user.map(nx.clustering(G)),
    to_cc = df.to_user.map(nx.clustering(G))
)
# users that never sent (or never received) produce NaNs in the mapped features; treat those as 0
df_feat.fillna(0, inplace=True)
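A quick look at the assembled feature frame confirms the fillna worked (a sketch; the column count depends on the assign above).
In [ ]:
# shape of the feature frame and a check that no NaNs remain
print(df_feat.shape)
print(df_feat.isnull().values.any())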
In [20]:
from sklearn.ensemble import IsolationForest
# drop identifier and raw date columns; train on the engineered features only
not_train_cols = ['txn_key', 'from_user', 'to_user', 'date']
X_train = df_feat[[col for col in df_feat.columns if col not in not_train_cols]].values
clf = IsolationForest(n_estimators=100,
                      contamination=0.01,
                      n_jobs=-1, random_state=42)
clf.fit(X_train)
Out[20]:
In [122]:
# decision-function threshold implied by the contamination rate
clf.threshold_
Out[122]:
In [ ]:
pred = clf.predict(X_train)   # 1 = normal, -1 = anomalous
anomalies = (pred != 1)       # boolean mask over the rows of X_train / df
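With contamination=0.01, roughly 1% of the transactions should be flagged; a quick check (sketch):
In [ ]:
# fraction of transactions flagged as anomalous, which should sit near the
# contamination rate passed to IsolationForest (0.01)
print('{} anomalous transactions ({:.2%})'.format(anomalies.sum(), anomalies.mean()))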
In [98]:
import seaborn as sns
import matplotlib.pyplot as plt
# anomaly scores returned by the forest (lower = more anomalous)
scores = clf.decision_function(X_train)
plt.figure(figsize=(12, 8))
sns.distplot(scores, kde=False)
line = plt.vlines(clf.threshold_, 0, 30000, colors='r', linestyles='dotted')
line.set_label('Threshold = {:.4f}'.format(clf.threshold_))
plt.legend(loc='upper left', fontsize='medium')
plt.title('Anomaly Scores returned by Isolation Forest', fontsize=16);
In [126]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2,  # perplexity=50,
            # n_iter=200, n_iter_without_progress=10,
            # angle=0.7,
            random_state=42)
X_tsne = tsne.fit_transform(X_train[150000:155000])
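A rough check on the embedding (a sketch): the result should be a 5000 x 2 array, and scikit-learn exposes the final KL divergence of the t-SNE optimization for a sense of fit quality.
In [ ]:
# shape of the 2-D embedding and the final KL divergence of the t-SNE fit
print(X_tsne.shape)
print(tsne.kl_divergence_)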
In [127]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# anomaly mask for the same 5,000-row slice that was embedded
outlier = anomalies[150000:155000]
plt.figure(figsize=(12,8))
plt.scatter(X_tsne[~outlier][:,0], X_tsne[~outlier][:,1], marker='.', c='b', alpha=.2)
plt.scatter(X_tsne[outlier][:,0], X_tsne[outlier][:,1], marker='o', c='r', alpha=1)
plt.legend(['Normal Transactions', 'Abnormal Transactions'])
plt.title('t-SNE Visualization of Normal Transactions vs Abnormal Transactions', fontsize=16);
In [192]:
json_date_format = '%Y-%m-%dT%H:%M:%SZ'
# df['date'] = df.date.dt.strftime(json_date_format)
# user ids involved in anomalous transactions
anomalies_id = np.concatenate((df[anomalies].from_user, df[anomalies].to_user))
# look for non-trivial strongly connected components containing at least one anomalous user
for scc in nx.strongly_connected_components(G_di):
    if len(scc) > 5:
        intersect = np.intersect1d(list(scc), anomalies_id)
        if intersect.size > 0:
            G_sub = G_di.subgraph(scc)
            # G_sub_json = json_graph.node_link_data(G_sub)
            # with open('../d3/json/network.json', 'w') as json_file:
            #     json.dump(G_sub_json, json_file)
In [187]:
anomalies = (pred != 1)
# user ids involved in anomalous transactions
np.concatenate((df[anomalies].from_user, df[anomalies].to_user))
Out[187]:
In [245]:
anomalies_pairs = list(zip(df[anomalies].from_user, df[anomalies].to_user))
# keep a subgraph around an anomalous pair whose combined neighbour list has
# 10-20 entries (the last qualifying pair wins)
for i, j in anomalies_pairs:
    neigh = G_di.neighbors(i)   # NetworkX 1.x: neighbors() returns a list, so += concatenates
    neigh += G_di.neighbors(j)
    nodes = neigh + [i, j]
    if len(nodes) > 10 and len(nodes) < 20:
        G_sub = nx.subgraph(G_di, nodes)
In [246]:
from networkx.readwrite import json_graph
import json
# a set (not an exhaustible iterator) so the membership test below works
anomalies_pairs = set(zip(df[anomalies].from_user, df[anomalies].to_user))
# tag edges so the D3 page can style anomalous links differently
# ('licensing'/'suit' are presumably the edge types expected by the force-layout example network.html is adapted from)
for e in G_sub.edges_iter():
    if e[:2] in anomalies_pairs:
        G_sub.edge[e[0]][e[1]]['type'] = 'licensing'
    else:
        G_sub.edge[e[0]][e[1]]['type'] = 'suit'
G_sub_json = json_graph.node_link_data(G_sub)
with open('../d3/json/network.json', 'w') as json_file:
    json.dump(G_sub_json, json_file)
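For reference, node_link_data emits the plain node-link structure that D3's force layout reads; a quick peek at its keys and the first link (a sketch, assuming the subgraph has at least one edge):
In [ ]:
# the node-link dict holds graph metadata plus 'nodes' and 'links' lists,
# which is exactly what the D3 page loads from network.json
print(sorted(G_sub_json.keys()))
print(G_sub_json['links'][0])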
In [220]:
# rebuild the directed user graph (NetworkX 1.x subgraphs share edge-attribute dicts,
# so the 'type' tags above also landed on the old G_di)
G_di = nx.from_pandas_dataframe(df,
                                source='from_user', target='to_user',
                                edge_attr=['txn_key', 'amount', 'date'],
                                create_using=nx.DiGraph())
In [1]:
from IPython.display import IFrame
# embed the D3 force-layout visualization of the anomalous subgraph
IFrame('./d3/html/network.html', width=1000, height=500)
Out[1]:
In [ ]: